import pandas as pd
import seaborn as sns
import numpy as np
import re
from datetime import timedelta, date, datetime
import matplotlib.pyplot as plt
from io import StringIO
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore') # setting ignore as a parameter
d = pd.read_csv('Growth Analytics Challenge - Dataset Conversion Data.csv')
d.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 11011 entries, 0 to 11010 Data columns (total 7 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Touched Date 11011 non-null object 1 Utm Medium 11011 non-null object 2 Utm Source 11011 non-null object 3 Utm Campaign 9884 non-null object 4 # Demo Occurred 11011 non-null float64 5 # Sales Qualified Opportunity 11011 non-null float64 6 # Closed Won Opportunity 11011 non-null float64 dtypes: float64(3), object(4) memory usage: 602.3+ KB
d = pd.read_csv('Growth Analytics Challenge - Dataset Conversion Data.csv')
# Dates
d['Touched Date'] = pd.to_datetime(d['Touched Date'])
d['Week'] = d['Touched Date'].dt.strftime('%Y-%U')
d['Week'] = pd.to_datetime(d['Week'] + '-1', format='%Y-%U-%w')
d['Month'] = d['Touched Date'].dt.strftime('%Y-%m')
d['Quarter'] = d['Touched Date'].dt.to_period('Q')
d['Weekday'] = d['Touched Date'].dt.day_name()
weekday_map = {
'Sunday': '1. Sunday',
'Monday': '2. Monday',
'Tuesday': '3. Tuesday',
'Wednesday': '4. Wednesday',
'Thursday': '5. Thursday',
'Friday': '6. Friday',
'Saturday': '7. Saturday'
}
# Apply the mapping
d['Weekday'] = d['Weekday'].map(weekday_map)
# Utm Medium and Campaigns Unknown
# Standardize 'Utm Campaign' column formatting
d['Utm Campaign'] = d['Utm Campaign'].astype(str).str.replace('_', '-').str.upper()
# Define mapping of keywords to regions
region_keywords = {
"NAM": ["USA", "CANADA", "NORTHAMERICA", "GHG-USA", "GHG-CANADA", "NAM"],
"EMEA": ["FRANCE", "GERMANY", "UK", "SPAIN", "TURKEY", "ITALY", "NETHERLANDS", "SWEDEN",
"SOUTHAFRICA", "PORTUGAL", "HUNGARY", "RWANDA", "UAE", "ALBANIA", "GEORGIA", "SERBIA", "NIGERIA",
"EMEA", "BENELUX", "NORDICS", "GHG-FRANCE", "GHG-GERMANY", "GHG-UK", "GHG-ITALY", "GHG-SPAIN", "GHG-TURKEY"],
"APAC": ["INDIA", "PHILIPPINES", "JAPAN", "VIETNAM", "SRILANKA", "APAC", "GHG-INDIA", "GHG-VIETNAM", "GHG-SRILANKA", "GHG-PHILIPPINES"],
"LATAM": ["BRAZIL", "MEXICO", "COLOMBIA", "URUGUAY", "LATAM", "CHILE", "SOUTHLATAM", "BRAZILLATAM",
"GHG-BRAZIL", "GHG-MEXICO", "GHG-COLOMBIA", "GHG-URUGUAY"],
"AFRICA": ["SOUTHAFRICA", "NIGERIA", "RWANDA", "AFRICA", "GHG-SOUTHAFRICA", "GHG-NIGERIA", "GHG-RWANDA"],
"GLOBAL": ["GLOBAL", "WORLDWIDE", "WW", "FY23", "SEARCH-NONBRANDED", "DEEL", "BUSINESS-CASE", "INTERNATIONAL-HIRING-GUIDE"],
}
# Function to classify region using regex keyword search
def classify_region(utm_campaign):
for region, keywords in region_keywords.items():
if any(re.search(rf"\b{kw}\b", utm_campaign) for kw in keywords):
return region
return "UNKNOWN"
# Apply classification to 'Utm Campaign' column
d['Region'] = d['Utm Campaign'].apply(classify_region)
# ----------------------------------------------------------------------------------------
# Standardize Utm Medium
d['Utm Medium'] = d['Utm Medium'].str.upper()
d['Utm Medium'] = d['Utm Medium'].str.replace(r'_+', '_', regex=True).str.strip('_')
# Standardize Utm
grouping_map = {
'FB': 'FACEBOOK', 'IG': 'INSTAGRAM', 'LINKEDIN_POST': 'LINKEDIN', 'LINKEDIN': 'LINKEDIN',
'EMAIL_NEWSLETTER': 'NEWSLETTER', 'SPONSORED_NEWSLETTER': 'NEWSLETTER', 'PROSPECT_NEWSLETTER': 'NEWSLETTER', 'MQL_EMAIL': 'NEWSLETTER',
'PAID_SOCIAL': 'PAID SOCIAL', 'PAIDSOCIAL': 'PAID SOCIAL',
'PAID_SEARCH': 'PAID SEARCH', 'UTM_MEDIUM_PAID_SEARCH': 'PAID SEARCH', 'CPC': 'CPC', 'PPC': 'PPC',
'PAID_LEAD_BUY': 'PAID LEAD', 'UTM_MEDIUM_PAID_LEAD_BUY': 'PAID LEAD',
'WEBINAR': 'WEBINAR', 'PARTNER_WEBINAR': 'WEBINAR', 'PARTNER___WEBINAR': 'WEBINAR',
'EMAIL_SEQUENCE': 'EMAIL', 'EMAIL_NURTURE': 'EMAIL', 'EMAIL+SEQUENCE': 'EMAIL',
'TRADESHOW_SPONSORSHIP': 'TRADESHOW', 'TRADESHOW_ATTENDANCE_ONLY': 'TRADESHOW', 'EVENT': 'TRADESHOW',
'DISPLAY': 'DISPLAY & NATIVE ADS', 'BANNER': 'DISPLAY & NATIVE ADS', 'NATIVE': 'DISPLAY & NATIVE ADS', 'IN_APP': 'DISPLAY & NATIVE ADS',
'CONTENT': 'CONTENT MARKETING', 'ARTICLE': 'CONTENT MARKETING', 'SPONSORED_POST': 'CONTENT MARKETING', 'Pdr': 'CONTENT MARKETING',
}
d['Utm Medium Std'] = d['Utm Medium'].replace(grouping_map)
# Better categorize CPC by Utm Source
d.loc[d['Utm Medium Std'].isin(['CPC', 'CPM', 'PPC']), 'Utm Medium Std'] = d['Utm Medium'] + '-' + d['Utm Source'].str.upper()
cpc_map = {
'CPC-FACEBOOK': 'CPC-FACEBOOK',
'CPM-LINKEDIN': 'CPC-LINKEDIN',
'CPC-LINKEDIN': 'CPC-SOCIAL',
'CPC-GOOGLE': 'CPC-SEARCH',
'CPC-BING': 'CPC-SEARCH',
'CPC-PEOPLEMANAGINGPEOPLE': 'CPC-CONTENT',
'CPC-SAASWORTHY.COM': 'CPC-CONTENT',
'CPC-CAPTERRA': 'CPC-CONTENT',
'CPC-RECRUITERSLINEUP': 'CPC-CONTENT',
'CPC-YCALUMNI': 'CPC-CONTENT',
'CPC-SELECTSOFTWAREREVIEWS': 'CPC-CONTENT',
'PPC-SELECTSOFTWAREREVIEWS': 'CPC-CONTENT',
'CPC-HRMAUSTRALIA': 'CPC-CONTENT',
'CPC-BLIND-ADS': 'CPC-CONTENT',
'CPC-MATCHR': 'CPC-CONTENT',
'CPC-NATURALINT': 'CPC-CONTENT',
'CPC-MVF-GLOBAL': 'CPC-CONTENT'
}
d['Utm Medium Std'] = d['Utm Medium Std'].replace(cpc_map)
# ----------------------------------------------------------------------------------------
# Utm channel
def categorize_channel(channel):
if any(x in channel for x in ['SEARCH', 'CPC-SEARCH', 'CPM', 'SPONSORED', 'PPC']):
return 'Paid-Search'
elif any(x in channel for x in ['EMAIL', 'NEWSLETTER', 'SEQUENCE', 'NURTURE']):
return 'Email Marketing & Newsletters'
elif any(x in channel for x in ['SOCIAL', 'FACEBOOK', 'LINKEDIN', 'INSTAGRAM', 'ZALO', 'CPC-SOCIAL']):
return 'Paid-Social'
elif any(x in channel for x in ['EVENT', 'TRADESHOW', 'WEBINAR']):
return 'Events & Webinars'
elif any(x in channel for x in ['REFERRAL', 'PARTNER', 'DIRECT-REFERRAL']):
return 'Referrals & Partnerships'
elif any(x in channel for x in ['DISPLAY', 'BANNER', 'NATIVE', 'IN-APP', 'DISPLAY & NATIVE ADS']):
return 'Display & Native Ads'
elif any(x in channel for x in ['CONTENT MARKETING', 'CPC-CONTENT', 'PAID-CONTENT-SYNDICATION', 'PAID-LISTICLE']):
return 'Content Marketing'
elif any(x in channel for x in ['COMPANY_PROFILE', 'FREE-COMPANY-PROFILE', 'PAID-COMPANY-LISTING']):
return 'Referrals & Partnerships'
elif any(x in channel for x in ['SURVEY', 'SELF-SIGNUP', 'APP', 'NONPAID', 'OTHER', 'ALL', 'LINKTREE']):
return 'Organic, Outbound & Other'
elif any(x in channel for x in ['ASHBY REC OPS', 'EDDY', 'AE', 'KRISTOPHER', 'LEA', 'JONATHAN', 'FP']):
return 'Organic, Outbound & Other'
else:
return 'Organic, Outbound & Other'
def categorize_top_channel(top_channel):
if any(x in top_channel.upper() for x in ['SEARCH', 'CPC-SEARCH', 'CPM', 'SPONSORED', 'PPC']):
return 'Paid-Search'
elif any(x in top_channel.upper() for x in ['SOCIAL', 'FACEBOOK', 'LINKEDIN', 'INSTAGRAM', 'ZALO', 'CPC-SOCIAL']):
return 'Paid-Social'
else:
return 'Other'
d['Utm Channel'] = d['Utm Medium Std'].apply(categorize_channel)
d['Channel key'] = d['Utm Channel'].str.upper() + '-' + d['Utm Source'].str.upper()
d['Utm Campaign'].fillna(d['Channel key'], inplace=True)
# Rename Columns
d.rename(columns={
'# Demo Occurred': 'Demo',
'# Sales Qualified Opportunity': 'Qualified_opp',
'# Closed Won Opportunity': 'Won',
'Utm Campaign': 'Campaign',
'Utm Medium': 'Medium',
'Utm Source': 'Source',
'Utm Medium Std': 'Medium Std',
'Utm Channel': 'Channel'
}, inplace=True)
d.to_csv('Deel_Conversion.csv', index=False)
# Display unique values of 'Utm Medium'
d.head()
| Touched Date | Medium | Source | Campaign | Demo | Qualified_opp | Won | Week | Month | Quarter | Weekday | Region | Medium Std | Channel | Channel key | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2023-10-29 | PAID-SEARCH | NAM-T1-ACQ-SEARCHNONBRANDED-GOOGLE-SEARCH-COUN... | 2.5 | 0.0 | 0.0 | 2023-10-30 | 2023-10 | 2023Q4 | 1. Sunday | NAM | PAID-SEARCH | Paid-Search | PAID-SEARCH-GOOGLE | |
| 1 | 2023-10-29 | PAID-LISTICLE | peoplemanagingpeople | WW-ACQ-PROSP-PMP-PAIDLIST-CADPAYROLL-PMP---ALL | 2.5 | 2.5 | 0.0 | 2023-10-30 | 2023-10 | 2023Q4 | 1. Sunday | GLOBAL | PAID-LISTICLE | Content Marketing | CONTENT MARKETING-PEOPLEMANAGINGPEOPLE |
| 2 | 2023-10-29 | PAID-SOCIAL | NAM-T1-CONSIDER-CALCULATE-FB-IMAGE-ALL-REENGAG... | 2.4 | 0.0 | 0.0 | 2023-10-30 | 2023-10 | 2023Q4 | 1. Sunday | NAM | PAID-SOCIAL | Paid-Social | PAID-SOCIAL-FACEBOOK | |
| 3 | 2023-10-29 | PAID-SEARCH | EMEA-T1A-ACQ-SEARCHNONBRANDED-GOOGLE-SEARCH-PA... | 0.0 | 0.0 | 0.0 | 2023-10-30 | 2023-10 | 2023Q4 | 1. Sunday | EMEA | PAID-SEARCH | Paid-Search | PAID-SEARCH-GOOGLE | |
| 4 | 2023-10-29 | SPONSORED-NEWSLETTER | hrci | WW-ENGAGE-DOWNLOAD-HRCI-SPONNEWSLETTER-HR-HRBU... | 0.0 | 0.0 | 0.0 | 2023-10-30 | 2023-10 | 2023Q4 | 1. Sunday | GLOBAL | SPONSORED-NEWSLETTER | Paid-Search | PAID-SEARCH-HRCI |
# Aggregate by Region
dr = d.groupby('Region').agg({'Demo': 'sum', 'Qualified_opp': 'sum', 'Won': 'sum'}).reset_index()
dr['Qualified_opp_Rate'] = dr['Qualified_opp'] / dr['Demo'] * 100 # Qualified Opps / Demo
dr['Won_Rate'] = dr['Won'] / dr['Qualified_opp'] * 100 # Won / Demo
dr['Is_Overall'] = False # Mark as regional data
# Calculate Overall Totals
overall = dr.sum(numeric_only=True) # Sum all numeric columns
overall["Region"] = "Overall"
overall["Qualified_opp_Rate"] = overall["Qualified_opp"] / overall["Demo"] * 100
overall["Won_Rate"] = overall["Won"] / overall["Qualified_opp"] * 100
overall["Is_Overall"] = True # Mark as overall data
# Append Overall Data
dr = pd.concat([dr, pd.DataFrame([overall])], ignore_index=True)
# Convert Leads Data to Long Format
dr1 = dr.melt(id_vars=['Region', 'Is_Overall'], value_vars=['Demo', 'Qualified_opp', 'Won'],
var_name='Funnel_Stage', value_name='Value')
dr1['Type'] = 'Leads'
dr1['Value'] = dr1['Value'].round(0).astype(int)
# Create conversion rate rows
conv_rate_rows = []
for _, row in dr.iterrows():
conv_rate_rows.append({"Region": row["Region"], "Funnel_Stage": "Demo", "Value": 100, "Type": "Conversion rate", "Is_Overall": row["Is_Overall"]})
conv_rate_rows.append({"Region": row["Region"], "Funnel_Stage": "Qualified_opp", "Value": row["Qualified_opp_Rate"], "Type": "Conversion rate", "Is_Overall": row["Is_Overall"]})
conv_rate_rows.append({"Region": row["Region"], "Funnel_Stage": "Won", "Value": row["Won_Rate"], "Type": "Conversion rate", "Is_Overall": row["Is_Overall"]})
# Convert to DataFrame
conv_rate_df = pd.DataFrame(conv_rate_rows)
conv_rate_df['Value'] = conv_rate_df['Value'].round(0).astype(int)
# Append conversion rate rows
dr2 = pd.concat([dr1, conv_rate_df], ignore_index=True)
dr2.sort_values(by=["Value", "Region"], ascending=[False, True], inplace=True)
# Merge Leads & Conversion Rates
df_leads = dr2[dr2["Type"] == "Leads"]
df_rates = dr2[dr2["Type"] == "Conversion rate"]
# Create a dictionary for conversion rates
rate_dict = {(row.Region, row.Funnel_Stage): row.Value for _, row in df_rates.iterrows()}
# Map conversion rates into the leads table
df_leads["Conversion_Rate"] = df_leads.apply(lambda row: rate_dict.get((row.Region, row.Funnel_Stage), 1), axis=1)
# Create the Funnel Chart
fig = px.funnel(df_leads,
y="Funnel_Stage",
x="Value",
color="Region",
facet_col="Is_Overall", # Splitting by Overall vs. Regions
template="plotly_white",
text=df_leads["Conversion_Rate"].apply(lambda x: f"{x}%"),
title="Funnel by Region (with Overall Comparison)",
labels={"Funnel_Stage": "Funnel Stage", "Value": "Leads"})
# Set thresholds
min_font_size = 12 # Minimum font size for visibility
hide_threshold = 500 # Hide labels for bars smaller than this
# Dynamically adjust text position & visibility
for trace in fig.data:
trace.textposition = [
"inside" if v > hide_threshold else "none" # Hides small labels
for v in trace.x
]
trace.textfont = dict(size=min_font_size)
# Adjust layout
fig.update_layout(margin=dict(l=80, r=20, t=60, b=60),
title_x=0.5)
fig.show()
df_leads_ = df_leads[df_leads["Is_Overall"] == False]
# By Region
fig2 = px.funnel(df_leads_,
y="Funnel_Stage",
x="Value",
color="Region",
facet_col="Region",
facet_col_wrap=3,
template="plotly_white",
text=df_leads_["Conversion_Rate"].apply(lambda x: f"{x}%"),
title="Funnel by Region",
labels={"Funnel_Stage": "Funnel Stage", "Value": "Leads"})
# Customize text formatting
fig2.update_traces(textposition="auto", texttemplate="<b>%{x}</b> (%{text})")
# Adjust layout
fig2.update_layout(margin=dict(l=80, r=20, t=60, b=60),
title_x=0.5)
fig2.show()
df_leads.to_csv('Deel_Conversion_pivot.csv', index=False)
import nbconvert
import os
os.system("jupyter nbconvert --to html py_Analysis.ipynb")
0